Install the required packages and load the libraries. If you have not yet installed the tidyverse and lubridate packages, uncomment the two install lines below before running the rest of the code.
#install.packages("tidyverse")
#install.packages("lubridate")
library(tidyverse)
library(lubridate)
# Load the Philadelphia policing dataset into `sn`.
# To read a local copy instead, use the commented-out line below and adjust the path:
# sn <- read_csv("~/Desktop/Code/pa_philadelphia_2019_02_25.csv")
sn <- read_csv(file = "https://datajournalism.tech/pa_philadelphia_2019_02_25.csv")
Explore the dataset provided by Stanford University. See more on their website https://openpolicing.stanford.edu.
View(sn) # open the full data table in the data viewer
str(sn) # print the structure: each column's name, type and first few values
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1891916 obs. of 19 variables:
## $ raw_row_number : num 1 2 3 4 5 6 7 8 9 10 ...
## $ date : Date, format: "2014-09-27" "2014-09-27" ...
## $ time : 'hms' num 20:05:00 19:27:00 19:27:00 20:38:00 ...
## ..- attr(*, "units")= chr "secs"
## $ location : chr "5600 BLOCK BROOMALL ST" "5200 BLOCK PENTRIDGE ST" "5200 BLOCK PENTRIDGE ST" "4900 BLOCK KINGSESSING AV" ...
## $ lat : num 39.9 39.9 39.9 39.9 40 ...
## $ lng : num -75.2 -75.2 -75.2 -75.2 -75.2 ...
## $ district : chr "12" "12" "12" "12" ...
## $ service_area : chr "124" "124" "124" "123" ...
## $ subject_age : num 24 58 31 29 35 39 49 61 43 48 ...
## $ subject_race : chr "black" "black" "black" "black" ...
## $ subject_sex : chr "male" "male" "male" "male" ...
## $ type : chr "pedestrian" "pedestrian" "pedestrian" "vehicular" ...
## $ arrest_made : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ outcome : chr NA NA NA NA ...
## $ contraband_found: logi NA NA NA NA NA NA ...
## $ frisk_performed : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ search_conducted: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ search_person : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ search_vehicle : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## - attr(*, "spec")=
## .. cols(
## .. raw_row_number = col_double(),
## .. date = col_date(format = ""),
## .. time = col_time(format = ""),
## .. location = col_character(),
## .. lat = col_double(),
## .. lng = col_double(),
## .. district = col_character(),
## .. service_area = col_character(),
## .. subject_age = col_double(),
## .. subject_race = col_character(),
## .. subject_sex = col_character(),
## .. type = col_character(),
## .. arrest_made = col_logical(),
## .. outcome = col_character(),
## .. contraband_found = col_logical(),
## .. frisk_performed = col_logical(),
## .. search_conducted = col_logical(),
## .. search_person = col_logical(),
## .. search_vehicle = col_logical()
## .. )
sn %>% glimpse() # compact overview: one line per column with its type and leading values
## Observations: 1,891,916
## Variables: 19
## $ raw_row_number <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, …
## $ date <date> 2014-09-27, 2014-09-27, 2014-09-27, 2014-09-27…
## $ time <time> 20:05:00, 19:27:00, 19:27:00, 20:38:00, 03:00:…
## $ location <chr> "5600 BLOCK BROOMALL ST", "5200 BLOCK PENTRIDGE…
## $ lat <dbl> 39.94374, 39.94522, 39.94522, 39.94304, 39.9502…
## $ lng <dbl> -75.23279, -75.22476, -75.22476, -75.21521, -75…
## $ district <chr> "12", "12", "12", "12", "09", "09", "09", "09",…
## $ service_area <chr> "124", "124", "124", "123", "092", "091", "092"…
## $ subject_age <dbl> 24, 58, 31, 29, 35, 39, 49, 61, 43, 48, 62, 38,…
## $ subject_race <chr> "black", "black", "black", "black", "black", "b…
## $ subject_sex <chr> "male", "male", "male", "male", "male", "female…
## $ type <chr> "pedestrian", "pedestrian", "pedestrian", "vehi…
## $ arrest_made <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ outcome <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ contraband_found <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ frisk_performed <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ search_conducted <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ search_person <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ search_vehicle <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
names(sn) # list the column headers
## [1] "raw_row_number" "date" "time"
## [4] "location" "lat" "lng"
## [7] "district" "service_area" "subject_age"
## [10] "subject_race" "subject_sex" "type"
## [13] "arrest_made" "outcome" "contraband_found"
## [16] "frisk_performed" "search_conducted" "search_person"
## [19] "search_vehicle"
# Store `subject_sex` and `subject_race` as factors instead of character
# vectors, so that summaries report a count for each category.
sn <- sn %>%
  mutate(
    subject_sex = as.factor(subject_sex),
    subject_race = as.factor(subject_race)
  )
After viewing the dataset, you can analyze it to see the min, max, mean, median and other values for each variable. These are called descriptive statistics.
# Descriptive statistics for every column of `sn`.
sn %>% summary()
## raw_row_number date time
## Min. : 1 Min. :2014-01-01 Length:1891916
## 1st Qu.: 472980 1st Qu.:2015-02-06 Class1:hms
## Median : 945958 Median :2016-01-04 Class2:difftime
## Mean : 945958 Mean :2016-02-01 Mode :numeric
## 3rd Qu.:1418937 3rd Qu.:2017-02-07
## Max. :1891916 Max. :2018-04-14
##
## location lat lng district
## Length:1891916 Min. :39.88 Min. :-75.28 Length:1891916
## Class :character 1st Qu.:39.96 1st Qu.:-75.20 Class :character
## Mode :character Median :39.99 Median :-75.16 Mode :character
## Mean :39.99 Mean :-75.16
## 3rd Qu.:40.02 3rd Qu.:-75.13
## Max. :40.14 Max. :-74.96
## NA's :106768 NA's :106768
## service_area subject_age subject_race
## Length:1891916 Min. : 10.00 asian/pacific islander: 40531
## Class :character 1st Qu.: 24.00 black :1265616
## Mode :character Median : 31.00 hispanic : 185503
## Mean : 34.69 other/unknown : 20696
## 3rd Qu.: 44.00 white : 379570
## Max. :110.00
## NA's :4612
## subject_sex type arrest_made outcome
## female: 470421 Length:1891916 Mode :logical Length:1891916
## male :1420807 Class :character FALSE:1795709 Class :character
## NA's : 688 Mode :character TRUE :96207 Mode :character
##
##
##
##
## contraband_found frisk_performed search_conducted search_person
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:84702 FALSE:1721352 FALSE:1773689 FALSE:1794287
## TRUE :33525 TRUE :170564 TRUE :118227 TRUE :97629
## NA's :1773689
##
##
##
## search_vehicle
## Mode :logical
## FALSE:1854385
## TRUE :37531
##
##
##
##
There are some dplyr verbs that you need to memorize. See more at https://learn.r-journalism.com/en/wrangling/dplyr/dplyr/. First, the select verb helps you grab one or more columns from a dataset, and the filter verb keeps only the rows that meet a condition.
# select() keeps only the six listed columns; the result is stored in `a`.
a <- sn %>%
  select(subject_race, subject_age, subject_sex, type, contraband_found, arrest_made)
# a <- sn %>% select(subject_race) # selecting a single column works the same way
# filter() keeps the rows where the subject's age is 10; those rows are then
# counted by whether an arrest was made, and the counts are stored in `b`.
b <- a %>%
  filter(subject_age == 10) %>%
  group_by(arrest_made) %>%
  summarize(value = n())
Second, the group_by verb sorts the rows into groups that share the same value. The summarize verb is usually paired with group_by: it produces one row per group, which lets you count the rows in each group and compute each group's share of the whole dataset.
# Count the rows for each value of `subject_sex`, compute each group's share
# of all rows, and store the result in a table called `sex`.
sex <- sn %>%
  group_by(subject_sex) %>%
  summarize(value = n()) %>%
  mutate(prop = value / sum(value))
## Warning: Factor `subject_sex` contains implicit NA, consider using
## `forcats::fct_explicit_na`
View(sex) # open the `sex` table in the data viewer
# Count the rows for each value of `subject_race`, compute each group's share
# of all rows, sort from the largest count to the smallest, and store the
# result in a table called `race`.
race <- sn %>%
  group_by(subject_race) %>%
  summarize(value = n()) %>%
  mutate(prop = value / sum(value)) %>%
  arrange(desc(value))
View(race) # open the `race` table in the data viewer
#Now it's your turn. Tell me how many stopped drivers were found with contraband? Or how many searches ended successfully? Write your code in the next line, without the hashtag.
We will need certain packages to be installed and called before creating our charts.
library(ggplot2)
#install.packages("devtools")
#devtools::install_github('bbc/bbplot')
library(bbplot)
To create this chart, we will use the ggplot2 package. Let’s create a simple line chart by modifying the code on Stanford University’s Open Policing Project website. This chart displays the total number of drivers stopped by the police each year, broken down by race.
# Count the rows per calendar year and race, then draw one coloured series
# (points joined by a line) for each race, styled with the BBC theme.
line <- sn %>%
  mutate(year = year(date)) %>%
  count(year, subject_race) %>%
  ggplot(mapping = aes(x = year, y = n, color = subject_race)) +
  geom_point() +
  geom_line() +
  bbc_style()
line
# Data prep: number of rows for each race category.
barprep <- sn %>%
  group_by(subject_race) %>%
  summarize(value = n())

# Make the plot: horizontal bars ordered by count, styled with the BBC theme.
bar <- ggplot(barprep, aes(x = reorder(subject_race, value), y = value)) +
  # geom_col() draws bars whose heights are the values in the data
  # (the same as geom_bar(stat = "identity")).
  geom_col(position = "identity", fill = "red") +
  geom_hline(yintercept = 0) +
  bbc_style() +
  labs(
    title = "Stopped Drivers by Race",
    subtitle = "African American drivers got stopped the most in the city of Philadelphia, Pennsylvania"
  ) +
  coord_flip()

# Penalize scientific notation so that large counts on the axis are printed
# in fixed notation when the plot is drawn.
options(scipen = 10000)
bar
You can export the graphics by running the names of the objects in the Console and clicking the Export button under the Plots tab. Alternatively, you can use one of the following commands to export a plot:
# Save the bar chart. Passing `plot = bar` names the chart to export explicitly
# instead of relying on whichever plot happened to be displayed last.
ggsave("bar.png", plot = bar, width = 40, height = 20, units = "cm")
# NOTE: writing .svg files may require the svglite package to be installed.
ggsave("bar.svg", plot = bar, width = 40, height = 20, units = "cm")
# Without `plot =`, ggsave() saves the last plot that was displayed. To export
# the line chart, pass it explicitly: ggsave("line.png", plot = line)
The following pie chart will be made with the ggplot2 package.
# A pie chart in ggplot2 is a single stacked bar drawn in polar coordinates:
# the counts are stacked along y, and y is then mapped to the angle.
ggplot(data = barprep, mapping = aes(x = "", y = value, fill = subject_race)) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0)
The following pie will be made with the plotly package.
#install.packages("plotly")
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Number of rows for each race category, largest group first.
pieprep <- sn %>%
  group_by(subject_race) %>%
  summarize(value = n()) %>%
  arrange(-value)

# Slice colours passed to plotly through `marker$colors`.
color <- c("gray", "pink", "red", "yellow", "green")

# Pie chart with the race label printed in white inside each slice and also
# used as the hover text; slices are separated by thin white lines.
pie <- plot_ly(
  data = pieprep,
  labels = ~subject_race,
  values = ~value,
  type = "pie",
  textposition = "inside",
  insidetextfont = list(color = "white"),
  hoverinfo = "text",
  text = ~paste(subject_race),
  marker = list(colors = color, line = list(color = "white", width = 1)),
  showlegend = FALSE
) %>%
  layout(title = "Stopped Drivers by Race")
pie
Now we will make a donut chart. Here are two ways to do it.
# The first way of creating a donut: start from the pre-computed `pieprep`
# table and add a pie trace with a hole in the middle.
plot_ly(data = pieprep, labels = ~subject_race, values = ~value, showlegend = FALSE) %>%
  add_pie(hole = 0.5) %>%
  layout(title = "Stopped Drivers by Race")

# The second way of creating a donut: count the rows per race and draw the
# chart in a single pipeline, storing the result in `donut`.
donut <- sn %>%
  group_by(subject_race) %>%
  summarize(value = n()) %>%
  plot_ly(labels = ~subject_race, values = ~value, showlegend = FALSE) %>%
  add_pie(hole = 0.5) %>%
  layout(title = "Stopped Drivers by Race")
donut
#list of packages we need
#install.packages("ggplot2")
#install.packages("ggmap")
#install.packages("maps")
#install.packages("mapdata")
#Call out the packages
library(ggplot2)
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
##
## Attaching package: 'ggmap'
## The following object is masked from 'package:plotly':
##
## wind
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(mapdata)
# Get outline data for the base maps as data frames that ggplot2 can draw.
# Of these four objects, the visible script only uses `counties` later on.
usa <- map_data(map = "usa")
states <- map_data(map = "state")
counties <- map_data(map = "county")
world <- map_data(map = "world2Hires")
# Keep only the rows dated in 2017.
f <- sn %>%
  filter(year(date) == 2017)

# Keep only the outline of Philadelphia County, Pennsylvania.
philly <- counties %>%
  filter(region == "pennsylvania", subregion == "philadelphia")

# Create the map: the county outline with one point per row of `f`,
# coloured by race and split into one panel per race.
ggplot(data = philly) +
  geom_polygon(mapping = aes(x = long, y = lat, group = group)) +
  coord_fixed(ratio = 1.3) +
  geom_point(
    data = f,
    mapping = aes(x = lng, y = lat, color = subject_race),
    size = 1
  ) +
  facet_wrap(~subject_race)
## Warning: Removed 8475 rows containing missing values (geom_point).
#install.packages("httpuv")
#install.packages("leaflet")
library(httpuv)
library(leaflet)
# Build an interactive map step by step: default tiles, a view centred on
# the given Philadelphia coordinates, and one marker with a popup label there.
m <- leaflet()
m <- addTiles(m)
m <- setView(m, lng = -75.172347, lat = 39.952150, zoom = 16)
m <- addMarkers(m, lng = -75.172347, lat = 39.952150, popup = "Philadelphia, PA")
m
# Colour palette that maps each race category to a fixed colour, in the order
# listed. It is stored as `race_pal` so that it does not overwrite the `race`
# summary table created earlier in this script.
race_pal <- colorFactor(
  palette = c("pink", "black", "yellow", "red", "blue"),
  domain = c("white", "black", "asian/pacific islander", "hispanic", "other/unknown"),
  ordered = TRUE
)

# Interactive map of the rows in `f`: one semi-transparent circle per row,
# coloured by race, with a popup describing the person's race and sex.
m2 <- leaflet(f) %>%
  addProviderTiles(providers$CartoDB) %>%
  setView(lng = -75.172347, lat = 39.952150, zoom = 16) %>%
  addCircleMarkers(
    ~lng, ~lat,
    popup = ~paste("This is a", subject_race, "and", subject_sex, "driver."),
    weight = 3,
    radius = 4,
    color = ~race_pal(subject_race),
    stroke = FALSE,
    fillOpacity = .5
  )
## Warning in validateCoords(lng, lat, funcName): Data contains 8475 rows with
## either missing or invalid lat/lon values and will be ignored
m2